library(tidyverse)
library(readr)
library(ggridges)
library(tidycensus)

Reading in the data

First you will want to read in your data. You can do this using read_csv. To open the data you can “uncomment” (delete the hash tag #) to view the data file. Note that you will need to “recomment” this out in order to knit your file again.

Describe the structure of the data

In a tidy dataset, each row is an observation, each column is a variable, and each cell is a value.

In this case each row represents an individual who was sentenced in the federal district court system in the U.S. in one of the 94 districts in the U.S.

When we start with a dataset we will want to think about and explore some key questions related to who is in our dataset, what the sentence was, when the individual was sentenced and where the sentence occurred.

Let’s inspect our variables which are in the columns.

# List the variable (column) names of the sentencing data
us_sent %>%
  names()
##  [1] "sentence_length"             "age"                        
##  [3] "sex"                         "educ"                       
##  [5] "year"                        "guilty_plea"                
##  [7] "base_chapter2_adjustments"   "base_chapter2_3_adjustments"
##  [9] "all_adjustments"             "grid_cell"                  
## [11] "mandatory_min"               "gov_departures"             
## [13] "district"                    "race"                       
## [15] "criminal_history"
# Print a per-column summary of the sentencing data
us_sent %>%
  summary()
##  sentence_length       age             sex              educ      
##  Min.   :  0.00   Min.   :16.00   Min.   :0.0000   Min.   :1.000  
##  1st Qu.: 12.03   1st Qu.:28.00   1st Qu.:0.0000   1st Qu.:1.000  
##  Median : 41.00   Median :35.00   Median :0.0000   Median :3.000  
##  Mean   : 62.96   Mean   :36.96   Mean   :0.1681   Mean   :3.101  
##  3rd Qu.: 87.00   3rd Qu.:44.00   3rd Qu.:0.0000   3rd Qu.:5.000  
##  Max.   :470.00   Max.   :97.00   Max.   :1.0000   Max.   :6.000  
##       year       guilty_plea     base_chapter2_adjustments
##  Min.   :2006   Min.   :0.0000   Min.   :-8.00            
##  1st Qu.:2009   1st Qu.:0.0000   1st Qu.:16.00            
##  Median :2013   Median :0.0000   Median :24.00            
##  Mean   :2013   Mean   :0.0454   Mean   :23.21            
##  3rd Qu.:2017   3rd Qu.:0.0000   3rd Qu.:30.00            
##  Max.   :2020   Max.   :1.0000   Max.   :53.00            
##  base_chapter2_3_adjustments all_adjustments  grid_cell         mandatory_min  
##  Min.   : 1.00               Min.   : 1.00   Length:518719      Mode :logical  
##  1st Qu.:16.00               1st Qu.:14.00   Class :character   FALSE:453517   
##  Median :24.00               Median :21.00   Mode  :character   TRUE :65202    
##  Mean   :23.52               Mean   :21.31                                     
##  3rd Qu.:30.00               3rd Qu.:28.00                                     
##  Max.   :64.00               Max.   :43.00                                     
##  gov_departures    district             race           criminal_history
##  Mode :logical   Length:518719      Length:518719      Min.   :1.000   
##  FALSE:417903    Class :character   Class :character   1st Qu.:1.000   
##  TRUE :100816    Mode  :character   Mode  :character   Median :2.000   
##                                                        Mean   :2.655   
##                                                        3rd Qu.:4.000   
##                                                        Max.   :6.000
# Show the internal structure of the data: column types and leading values
us_sent %>%
  str()
## spc_tbl_ [518,719 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ sentence_length            : num [1:518719] 0.46 90 65 87 60 62 84 120 53 0 ...
##  $ age                        : num [1:518719] 36 21 35 26 30 23 29 26 45 36 ...
##  $ sex                        : num [1:518719] 0 0 0 0 0 0 0 0 0 1 ...
##  $ educ                       : num [1:518719] 3 3 1 1 3 5 1 3 3 3 ...
##  $ year                       : num [1:518719] 2006 2006 2006 2006 2006 ...
##  $ guilty_plea                : num [1:518719] 0 0 0 0 0 0 0 0 0 0 ...
##  $ base_chapter2_adjustments  : num [1:518719] 38 18 28 32 28 28 28 14 20 20 ...
##  $ base_chapter2_3_adjustments: num [1:518719] 38 18 28 32 28 28 26 14 20 20 ...
##  $ all_adjustments            : num [1:518719] 35 15 25 29 26 25 23 34 17 17 ...
##  $ grid_cell                  : chr [1:518719] "XFOLSOR35XCRHISSR5" "XFOLSOR15XCRHISSR4" "XFOLSOR25XCRHISSR2" "XFOLSOR29XCRHISSR3" ...
##  $ mandatory_min              : logi [1:518719] FALSE TRUE FALSE FALSE FALSE TRUE ...
##  $ gov_departures             : logi [1:518719] TRUE FALSE FALSE FALSE FALSE FALSE ...
##  $ district                   : chr [1:518719] "Dist of Columbia" "Dist of Columbia" "Dist of Columbia" "Dist of Columbia" ...
##  $ race                       : chr [1:518719] "white" "black" "black" "black" ...
##  $ criminal_history           : num [1:518719] 5 4 2 3 5 1 5 6 4 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   sentence_length = col_double(),
##   ..   age = col_double(),
##   ..   sex = col_double(),
##   ..   educ = col_double(),
##   ..   year = col_double(),
##   ..   guilty_plea = col_double(),
##   ..   base_chapter2_adjustments = col_double(),
##   ..   base_chapter2_3_adjustments = col_double(),
##   ..   all_adjustments = col_double(),
##   ..   grid_cell = col_character(),
##   ..   mandatory_min = col_logical(),
##   ..   gov_departures = col_logical(),
##   ..   district = col_character(),
##   ..   race = col_character(),
##   ..   criminal_history = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Transposed preview of the data: one line per column
us_sent %>%
  glimpse()
## Rows: 518,719
## Columns: 15
## $ sentence_length             <dbl> 0.46, 90.00, 65.00, 87.00, 60.00, 62.00, 8…
## $ age                         <dbl> 36, 21, 35, 26, 30, 23, 29, 26, 45, 36, 44…
## $ sex                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ educ                        <dbl> 3, 3, 1, 1, 3, 5, 1, 3, 3, 3, 3, 5, 1, 3, …
## $ year                        <dbl> 2006, 2006, 2006, 2006, 2006, 2006, 2006, …
## $ guilty_plea                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ base_chapter2_adjustments   <dbl> 38, 18, 28, 32, 28, 28, 28, 14, 20, 20, 30…
## $ base_chapter2_3_adjustments <dbl> 38, 18, 28, 32, 28, 28, 26, 14, 20, 20, 30…
## $ all_adjustments             <dbl> 35, 15, 25, 29, 26, 25, 23, 34, 17, 17, 27…
## $ grid_cell                   <chr> "XFOLSOR35XCRHISSR5", "XFOLSOR15XCRHISSR4"…
## $ mandatory_min               <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, FA…
## $ gov_departures              <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
## $ district                    <chr> "Dist of Columbia", "Dist of Columbia", "D…
## $ race                        <chr> "white", "black", "black", "black", "black…
## $ criminal_history            <dbl> 5, 4, 2, 3, 5, 1, 5, 6, 4, 1, 1, 1, 2, 1, …

Who

A key motivation for this work is exploring how our personal and racial identities influence the judicial sentencing. In a fair, democratic system this identity or perceived identity should not affect how we are treated by the justice system.

In this dataset, we have four variables that are related to identity: age, sex, educ: education, and race. It is important to note that these characteristics are not independent of one another but can intersect and interact with one another. This combined effect is known as intersectionality. The Center for Intersectional Justice describes the concept of intersectionality as: “the ways in which systems of inequality based on gender, race, ethnicity, sexual orientation, gender identity, disability, class and other forms of discrimination “intersect” to create unique dynamics and effects” (Ref: Center for Intersectional Justice, July 20, 2023).

Sex

Sex has been coded as a binary variable 0 and 1, where 0 is “Male” and 1 is “Female”.

# Which codes appear in the sex column?
distinct(us_sent, sex)
## # A tibble: 2 × 1
##     sex
##   <dbl>
## 1     0
## 2     1

We can recode this variable using mutate and case_when

To add

  • Add how sex was defined (sex is defined in pre-sentencing report that comes out of the investigation done by the probation office. This goes to the attorneys and also comes out of an interview with the individual. It should be noted that there are only two categories, so it is likely that there might be only two options possible). See Background section or link to official report for more information.
  • Discuss how a binary variable will not reflect sentenced individuals’ gender identity and excludes several groups. It is also limiting in nature.
  • Discuss difference between sex and gender
  • Discuss who has defined this variable and why this is problematic
  • Discuss the limitations and how this affects the analysis and inferences we can make.
  • Update sex label with more appropriate label based on Background research.
# Bar chart of the number of sentenced individuals by recorded sex.
# The 0/1 coding is relabelled first so the axis and legend are readable.
us_sent %>%
  mutate(
    sex = case_when(
      sex == 0 ~ "Male",
      sex == 1 ~ "Female"
    )
  ) %>%
  ggplot(aes(x = sex, fill = sex)) +
  geom_bar() +
  scale_fill_viridis_d() +
  labs(
    title = "Sex* of Sentenced Individuals",
    x = "Sex*",
    y = "Number of Individuals",
    fill = "Sex*"
  )

Race

# Which categories appear in the race column?
distinct(us_sent, race)
## # A tibble: 4 × 1
##   race    
##   <chr>   
## 1 white   
## 2 black   
## 3 other   
## 4 hispanic
# Bar chart of the number of sentenced individuals by recorded race,
# with bars ordered from the most to the least frequent category.
# NOTE(review): "other" is relabelled "ari" -- confirm the intended meaning
# of this label.
us_sent %>%
  mutate(race = if_else(race == "other", "ari", race)) %>%
  ggplot(aes(x = fct_infreq(race), fill = race)) +
  geom_bar() +
  scale_fill_viridis_d() +
  labs(
    title = "Race* of Sentenced Individuals",
    x = "Race*",
    y = "Number of Individuals",
    fill = "Race*"
  )

Race and Sex

# Number of individuals in each race-by-sex combination
count(us_sent, race, sex)
## # A tibble: 8 × 3
##   race       sex      n
##   <chr>    <dbl>  <int>
## 1 black        0 160152
## 2 black        1  23500
## 3 hispanic     0  85305
## 4 hispanic     1  18603
## 5 other        0  22036
## 6 other        1   5654
## 7 white        0 164017
## 8 white        1  39452
# Stacked bar chart: number of sentenced individuals per race category,
# with each bar split by recorded sex.
# NOTE(review): "other" is relabelled "ari" -- confirm the intended meaning
# of this label.
us_sent %>%
  mutate(
    race = if_else(race == "other", "ari", race),
    sex = case_when(
      sex == 0 ~ "Male",
      sex == 1 ~ "Female"
    )
  ) %>%
  ggplot(aes(x = fct_infreq(race), fill = sex)) +
  geom_bar() +
  scale_fill_viridis_d() +
  labs(
    title = "Race* and Sex* of Sentenced Individuals",
    x = "Race*",
    y = "Number of Individuals",
    fill = "Sex*"
  )

Proportional Race and Sex

We can also create a proportional bar plot

# Proportional (100%) stacked bar chart: within each race category, the share
# of sentenced individuals by recorded sex.
# position = "fill" rescales every bar to height 1, so the y-axis shows a
# proportion rather than a count and is labelled accordingly.
us_sent %>%
  mutate(race = case_when(race == "other" ~ "ari",
                          TRUE ~ race)) %>%
  mutate(sex = case_when(sex == 0 ~ "Male",
                         sex == 1 ~ "Female")) %>%
  ggplot() +
  geom_bar(aes(x = fct_infreq(race), fill = sex), position = "fill") +
  labs(x = "Race*",
       title = "Race* and Sex* of Sentenced Individuals",
       y = "Proportion of Individuals",
       fill = "Sex*") +
  scale_fill_viridis_d()

#### Age

We might want to explore what the age of different individuals is across districts. Let’s explore the districts of Maine, Rhode Island, and Vermont.

# Keep only the rows sentenced in the Maine, Rhode Island or Vermont districts
filter(
  us_sent,
  district %in% c("Maine", "Rhode Island", "Vermont")
)
## # A tibble: 5,257 × 15
##    sentence_length   age   sex  educ  year guilty_plea base_chapter2_adjustments
##              <dbl> <dbl> <dbl> <dbl> <dbl>       <dbl>                     <dbl>
##  1              10    28     0     1  2006           0                        13
##  2              18    20     0     3  2006           0                        18
##  3             184    22     0     1  2006           0                        25
##  4              44    37     0     3  2006           0                        16
##  5              42    27     0     5  2006           0                        20
##  6             210    45     0     5  2006           1                        32
##  7             108    27     0     3  2006           0                        31
##  8             130    22     0     1  2006           0                        23
##  9               0    38     0     6  2006           0                        10
## 10              19    27     0     3  2006           0                        16
## # ℹ 5,247 more rows
## # ℹ 8 more variables: base_chapter2_3_adjustments <dbl>, all_adjustments <dbl>,
## #   grid_cell <chr>, mandatory_min <lgl>, gov_departures <lgl>, district <chr>,
## #   race <chr>, criminal_history <dbl>

To add: - what do we divide by? - You may look at this plot and say that whites are sentenced at the federal district court more than black individuals. This is where it is important to look at the population in each district.

Future Directions

Bringing in the spatial district files

What

How does sentence length correlate with criminal history?

Sentence lengths and how do they relate to policy?

When we are exploring the distribution of a dataset we can use a number of different plots. To better understand sentence length data, we’ll display a histogram for this quantitative variable. A histogram gives us a visual representation of the frequency of values. With R, we can change the width of each bin or choose a number of bins, and then the plot shows us how many sentences fell within each bin range.

Histogram
# Histogram of sentence length in months, using 12-month (one-year) bins.
# Red dashed reference lines mark 12, 60, 120 and 240 months
# (1, 5, 10 and 20 years).
# The intercepts are passed as a parameter rather than through aes(): a
# constant inside aes() is evaluated against the inherited data, so each
# reference line would be drawn once per row of us_sent.
ggplot(us_sent) +
  geom_histogram(aes(x = sentence_length), binwidth = 12) +
  labs(x = "Sentence length in months", y = "Number of individuals") +
  geom_vline(xintercept = c(12, 60, 120, 240), color = "red", linetype = 2)

##### Histogram

# Histogram of sentence length converted from months to years, using
# one-year bins. Red dashed reference lines mark 1, 5, 10 and 20 years.
# The intercepts are passed as a parameter rather than through aes(): a
# constant inside aes() is evaluated against the inherited data, so each
# reference line would be drawn once per row of us_sent.
ggplot(us_sent) +
  geom_histogram(aes(x = sentence_length / 12), binwidth = 1) +
  labs(title = "Distribution of number of individuals by sentence length in years",
       x = "Sentence length in years",
       y = "Number of individuals") +
  geom_vline(xintercept = c(1, 5, 10, 20), color = "red", linetype = 2)

We observe a high frequency at 0, meaning that many individuals who are convicted may in fact receive a sentence of 0 months. At the other extreme, we see cases at 470 months, representing individuals who have been given either a very long sentence (about 39 years) or possibly a life sentence.

[could further discuss skew, peaks and relate to sentencing table, etc.]

Violin Plot

Violin plots are another way of showing the distribution of the data.

# Violin plot: distribution of sentence length (months) for each
# criminal-history category, one filled violin per category.
ggplot(
  us_sent,
  aes(
    x = as.factor(criminal_history),
    y = sentence_length,
    fill = as.factor(criminal_history)
  )
) +
  geom_violin() +
  labs(
    x = "Criminal History",
    y = "Sentence length in months",
    fill = "Criminal History"
  )

##### Ridge plot

# Ridge plot: density of sentence length (months) for each criminal-history
# category. Black dashed reference lines mark 12, 60, 120, 180 and 240 months
# (1, 5, 10, 15 and 20 years).
# The intercepts are passed as a parameter rather than through aes(): a
# constant inside aes() is evaluated against the inherited data, so each
# reference line would be drawn once per row of us_sent.
ggplot(us_sent) +
  geom_density_ridges(aes(y = as.factor(criminal_history),
                          x = sentence_length,
                          fill = as.factor(criminal_history))) +
  labs(title = "Distribution of sentence length in months by criminal history",
       subtitle = "Black dashed lines indicate 1, 5, 10, 15, and 20 year sentences",
       fill = "Criminal History",
       x = "Sentence length in months",
       y = "Criminal History") +
  geom_vline(xintercept = c(12, 60, 120, 180, 240),
             color = "black", linetype = 2) +
  scale_fill_viridis_d()
## Picking joint bandwidth of 5.47

What is the relationship between all_adjustments and sentence_length?

# Scatter plot of sentence length against all_adjustments, coloured by
# criminal-history category, with one panel per race category.
# The very low alpha keeps heavily overplotted regions readable.
ggplot(
  us_sent,
  aes(
    x = all_adjustments,
    y = sentence_length,
    color = as.factor(criminal_history)
  )
) +
  geom_point(alpha = 0.01) +
  facet_wrap(~ race) +
  scale_color_viridis_d() +
  labs(
    x = "All Adjustments",
    y = "Sentence length in months",
    color = "Criminal History"
  )

# Scatter plot of sentence length against all_adjustments for the whole
# dataset, coloured by criminal-history category.
# The very low alpha keeps heavily overplotted regions readable.
ggplot(
  us_sent,
  aes(
    x = all_adjustments,
    y = sentence_length,
    color = as.factor(criminal_history)
  )
) +
  geom_point(alpha = 0.01) +
  scale_color_viridis_d() +
  labs(
    x = "All Adjustments",
    y = "Sentence length in months",
    color = "Criminal History"
  )

How does the age range vary with criminal history?

# Ridge plot: density of age (years) for each criminal-history category
ggplot(
  us_sent,
  aes(
    x = age,
    y = as.factor(criminal_history),
    fill = as.factor(criminal_history)
  )
) +
  geom_density_ridges() +
  labs(
    x = "Age of individual in years",
    y = "Criminal History",
    fill = "Criminal History"
  )
## Picking joint bandwidth of 0.989

When

Where

Let’s check what districts we have in the data. We can do this using distinct(district).

# One row per district present in the data
distinct(us_sent, district)
## # A tibble: 93 × 1
##    district        
##    <chr>           
##  1 Dist of Columbia
##  2 Maine           
##  3 Massachusetts   
##  4 New Hampshire   
##  5 Puerto Rico     
##  6 Rhode Island    
##  7 Connecticut     
##  8 New York East   
##  9 New York North  
## 10 New York South  
## # ℹ 83 more rows

You’ll notice that certain states are a single district on their own. Other larger states are split into several districts (e.g. New York East, New York North).

Q: If you live in the United States, can you identify which district you live in?

To Do Add a link to a reference where someone could also find which district their town/city is in.

We can use distinct to figure out which districts are in our dataset, but we are also interested in how many sentences were handed down in each district. We will want to think about this question in relation to the population of those districts, which can be quite nuanced; we will come back to this later.

Our goals in this next section are to think about ways we can explore the question “Where did those sentences occur” in a visual way.

Learning aims

  • introduce factors as a data type
  • explain why it might be better to put our names on the y axis instead of the x-axis for many categories and those with longer names. This makes it more readable.
  • explain that categorical data is automatically placed in alphabetical order.
  • we can reconfigure the graph using functions from the forcats package.
  • This is a good example for when you might want to order it in terms of frequency (i.e. the count) using fct_infreq
  • We may also want to reverse the order to see the districts which have the most sentences at the top of our plot using fct_rev.

Number of individuals sentenced across districts

Base Bar Plot
# Base bar chart: number of rows (sentenced individuals) per district,
# with districts on the x-axis
ggplot(us_sent, aes(x = district)) +
  geom_bar()

District on the y-axis
# Same counts with districts on the y-axis, so the names read horizontally
ggplot(us_sent, aes(y = district)) +
  geom_bar()

Ordered by number of sentences
# Order the districts by how often they occur instead of alphabetically
ggplot(us_sent, aes(y = fct_infreq(district))) +
  geom_bar()

Ordering from high to low
# Reverse the frequency ordering of the districts along the y-axis
ggplot(us_sent, aes(y = fct_rev(fct_infreq(district)))) +
  geom_bar()

Add title and axes labels
# Bar chart of the number of sentenced individuals per district, ordered by
# frequency (reversed), with a title and axis labels.
# The year range in the title is computed from the data so that it replaces
# the "x to x" placeholder and stays correct if the data are updated.
ggplot(us_sent) +
  geom_bar(aes(y = fct_rev(fct_infreq(district)))) +
  labs(title = paste("Number of individuals sentenced at the federal district court level for each district from",
                     min(us_sent$year, na.rm = TRUE),
                     "to",
                     max(us_sent$year, na.rm = TRUE)),
       y = "Federal District Court",
       x = "Number of individuals")

Making the plot more readable with subsetting
# Two titled bar charts of the number of sentenced individuals per district:
# the first with the frequency ordering reversed, the second with the
# default frequency ordering.
# The year range in each title is computed from the data so that it replaces
# the "x to x" placeholder and stays correct if the data are updated.
# NOTE(review): the section heading mentions subsetting, but no subsetting of
# districts is applied here yet.
ggplot(us_sent) +
  geom_bar(aes(y = fct_rev(fct_infreq(district)))) +
  labs(title = paste("Number of individuals sentenced at the federal district court level for each district from",
                     min(us_sent$year, na.rm = TRUE),
                     "to",
                     max(us_sent$year, na.rm = TRUE)),
       y = "Federal District Court",
       x = "Number of individuals")

ggplot(us_sent) +
  geom_bar(aes(y = fct_infreq(district))) +
  labs(title = paste("Number of individuals sentenced at the federal district court level for each district from",
                     min(us_sent$year, na.rm = TRUE),
                     "to",
                     max(us_sent$year, na.rm = TRUE)),
       y = "Federal District Court",
       x = "Number of individuals")

Exploring the census data

To Do:

  • If we want to get out the district populations, we will probably need to download the data by the county level.
  • Decision points: we could download the data in categories by age and sex and race: e.g. 5-9, 10-14 etc. This might get a little
# Register the Census API key for this session.
# The key is read from the CENSUS_API_KEY environment variable (for example,
# set in ~/.Renviron) instead of being hard-coded, so the secret is never
# committed or published with the document.
# NOTE(review): the key that was previously embedded here has been exposed
# and should be revoked and replaced.
census_key <- Sys.getenv("CENSUS_API_KEY")
if (!nzchar(census_key)) {
  stop(
    "CENSUS_API_KEY is not set; request a key at ",
    "https://api.census.gov/data/key_signup.html and add it to ~/.Renviron.",
    call. = FALSE
  )
}
census_api_key(census_key, overwrite = TRUE)

# import census data 
## guide to spatial units: https://api.census.gov/data/2016/acs/acs5/geography.html
## variable of interest --> population
## vars <- load_variables(year = 2013,
                      # dataset = "acs5",
                      # cache = TRUE)

# B02001_001: Total
# B03002_003: White alone (Not Hispanic or Latino)
# B03002_004: Black or African American alone (Not Hispanic or Latino)
# B03002_012: Hispanic or Latino
# B03002_005: Native American alone (Not Hispanic or Latino)
# B03002_006: Asian alone (Not Hispanic or Latino)
# B03002_007: Native Hawaiian or Pacific Islander alone (Not Hispanic or Latino)
# B03002_009: Multiple Races (Not Hispanic or Latino)
# B03002_008: Other (Not Hispanic or Latino)

#census_place_df <- get_acs(geography = "state", variables = c("B01003_001E"), geometry = TRUE, year = 2010)